
Jarred van de Voort and Sean Coneys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
%matplotlib inline
#method used for preprocessing
from sklearn.preprocessing import StandardScaler
#models used for training/fitting data
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
import xgboost
#methods for training and optimizing model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score,accuracy_score,confusion_matrix, f1_score, precision_score, auc,roc_auc_score,roc_curve, precision_recall_curve
from sklearn.metrics import average_precision_score,confusion_matrix,precision_recall_curve,auc,roc_curve,recall_score,classification_report
#methods for resampling
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids,NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade
#ensemble methods
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')
Using a public API, we're able to derive all transactions to and from a given address. The following dataset is a series of transactions associated with a large ponzi scheme known as "Leancy".
# Pull the full transaction history for the "Leancy" ponzi address.
df_ponzi = pd.read_csv('datasets/leancy_txhistory.csv')
# Dates are stored as YYYYMMDD integers; go via str so the format parses.
df_ponzi['date'] = pd.to_datetime(df_ponzi['date'].astype(str), format='%Y%m%d')
df_ponzi.head(10)
To motivate this problem, let's take a look at the total number of transactions and BTC in/out to get a sense of scale.
# Headline numbers: total transaction count and peak balance of the address.
print(f'NUM TRANSACTIONS: {len(df_ponzi)}')
print(f"MAX BAL: BTC {df_ponzi['bal'].max()}")
df_ponzi['desc'].value_counts()
At its peak, Leancy had 134 BTC, roughly the equivalent of 500,000 USD, and saw 24,000 transactions over the lifetime of the scam. The number of payments received indicates that thousands of investors were involved. To get a better sense of the flow of currency in and out, we can plot balance and transactions over time.
# Restrict to the scheme's active lifetime (Jan 2013 - Apr 2014) and plot balance.
in_window = (df_ponzi['date'] > '2013-1-1') & (df_ponzi['date'] <= '2014-4-1')
df_daterange = df_ponzi[in_window]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Balance over time")
ax.set_xlabel("Date")
ax.set_ylabel("BTC")
ax.plot(df_daterange['date'], df_daterange['bal'])
The figure above shows frequent fluctuation in the total balance of the address, which is consistent with the behaviour that ponzi schemes use funds from new investors to pay old investors at a high cadence.
# Count transactions per date within the window and plot them as bars.
tx_counts = df_daterange['date'].value_counts(sort=False)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("# Transactions per month")
ax.set_xlabel("Month")
ax.set_ylabel("# Transactions")
ax.bar(tx_counts.index, tx_counts)
plt.show()
Similarly, the number of transactions scales as the number of investors continues to increase. The scheme implodes when no more new funding can be secured to pay off old investors. The graph shows that after the third month, the number of transactions drops off significantly, indicating the collapse of the Leancy ponzi scheme.
# Zoom in on the final three months before the collapse: incoming (green)
# vs outgoing (red) BTC per transaction date.
final_months = (df_ponzi['date'] > '2014-1-1') & (df_ponzi['date'] <= '2014-4-1')
df_inout = df_ponzi[final_months]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("BTC In vs. Out")
ax.set_xlabel("Date")
ax.set_ylabel("BTC")
ax.plot(df_inout['date'], df_inout['btc_out'], 'r',
        df_inout['date'], df_inout['btc_in'], 'g')
Separating BTC in vs. out tells us much of the same story. In this graph we can see that in the days leading up to the collapse, the amount of incoming BTC fell below the mark needed to sustain the next level of growth.
The first step in collecting data associated with ponzi schemes required an incredibly manual process of extracting data from several forums that advertised ponzi schemes disguised as "high yield investment programs". Some of these sites include bitcoinwhoiswho.com, bitcointalk.org, and reddit.com. The other class, of non-ponzi addresses, consists of 3000 randomly chosen addresses derived from this study.
# Load the hand-labelled dataset (ponzi + random non-ponzi addresses).
df = pd.read_csv('datasets/final_aggregated_dataset.csv')
df.head(10)
# The raw address string carries no predictive signal; drop it.
# Keyword form: the positional `axis` argument was removed in pandas 2.0.
df = df.drop(columns='address')
# Split into feature matrix X and label vector y ('class': 1 = ponzi).
X = df.iloc[:, df.columns != 'class']
y = df.iloc[:, df.columns == 'class']
Since we manually calculated our features, most of the data cleaning is already handled for us. The full feature calculation pipeline can be viewed in the project repository.
Let's take a look at our data by visualizing the distribution of some of our features:
# Distribution of wallet lifetime.
ax = sns.distplot(df['lifetime']);
ax.set_title('Wallet Lifetime Distribution')
ax.set_xlabel('Lifetime')
ax.set_ylabel('Frequency')
# Distribution of the in/out transaction ratio.
ax = sns.distplot(df['in_vs_out'], color = '#551a8b');
ax.set_title('In vs. Out')
# Fixed: this axis was previously mislabelled as 'Lifetime'.
ax.set_xlabel('in_vs_out')
ax.set_ylabel('Total in_vs_out')
Shown above we have the distributions for lifetime and in/out of an address. It's clear that our data suffers from a high amount of skew. We can use a pairplot to visualize skewness in our feature matrix.
sns.pairplot(data=df, hue = 'class')
As suspected, most of our features are highly skewed, which will likely affect performance during training. Let's investigate the level of skewness for each feature using scipy's skew package.
from scipy.stats import skew
# Per-column skewness, largest first (X.dtypes.index is just X's columns).
skewed_feats = X[X.dtypes.index].apply(lambda col: skew(col.dropna()))
skewed_feats = skewed_feats.sort_values(ascending=False)
skewed_df = pd.DataFrame({'Skew': skewed_feats})
skewed_df.head(16)
We can take the log of our features to help rebalance our feature distributions. In this case, we'll use numpy's log1p which returns $log(1 + x)$ where $x$ is the feature value:
X[skewed_df.index] = np.log1p(X[skewed_df.index])
To see the effect of this transformation, let's visualize our feature matrix once again:
# Rebuild the full frame from the log-transformed features and re-inspect.
df = pd.concat([X, y], axis=1)
sns.pairplot(df, hue='class')
By comparing our feature matrix before and after transforming our feature values, we can see that this transformation provides more separable data with respect to our two different classes. The matrix below derives the correlation coefficient for each cell in our feature matrix, shedding insight on which features may yield more/less predictive power.
# Pairwise Pearson correlations between all features (and the label).
corrmat = df.corr()
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corrmat, vmax=0.8, vmin=-0.8, cbar=True, annot=False,
            square=True, cmap="RdBu_r", ax=ax)
plt.show()
# Re-derive X/y from the log-transformed frame and hold out 20% for testing.
X = df.iloc[:, df.columns != 'class']
y = df.iloc[:, df.columns == 'class']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)
# NOTE(review): the scaler is fit on ALL of X (train + test) - a leakage risk
# if the scaled frame were used for modelling. Here it is only displayed, and
# the later training cells use the unscaled X_train; confirm that is intended.
scaled_features = StandardScaler().fit_transform(X.values)
scaled_features_df = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)
scaled_features_df.head()
Now that we have a more balanced, standardized dataset, we can begin to handle issues with class imbalance, which poses significant challenges in the training and tuning of our model. The figure below shows the ratio of ponzi schemes to non ponzi.
# Visualize and quantify the class imbalance (0 = normal, 1 = ponzi).
sns.countplot("class",data=df)
num_norm_tx = len(df[df["class"]==0])
num_ponzi_tx = len(df[df["class"]==1])
norm_tx_per = num_norm_tx/(num_norm_tx+num_ponzi_tx)
ponzi_tx_per = num_ponzi_tx/(num_norm_tx+num_ponzi_tx)
print("Ponzi instances: ", num_ponzi_tx)
print("Normal instances: ", num_norm_tx)
print("-----")
# Reuse the ratios computed above instead of re-deriving them inline
# (the two variables were previously computed but never used).
print("Ponzi TX %: ", ponzi_tx_per*100)
print("Normal TX %: ", norm_tx_per*100)
As seen above, our dataset is highly imbalanced with only a few instances of the minority class. This means we have to take extra consideration when specifying metrics used to determine the success of our model. A naive approach would find a consistently high level of accuracy, in which the model is always predicting the majority class.
The confusion matrix below illustrates how these metrics are associated with classifying ponzi schemes.
$TP = Correctly\space identified\space ponzi$
$FP = Incorrectly\space identified\space non-ponzi\space as\space ponzi$
$FN = Incorrectly\space identified\space ponzi\space as\space non-ponzi$
$TN = Correctly\space identified\space non-ponzi$
$Accuracy = \frac{tp + tn}{tp + tn + fp + fn}$ $Precision = \frac{tp}{tp+fp}$ $Recall = \frac{tp}{tp+fn}$
Instead, we'll use recall as our metric for determining the performance of our model. Recall is best suited for situations where there is a high cost associated with false negative, or in this case, failing to identify a ponzi scheme.
Imbalanced classes provide a variety of challenges when training a model. We can address class imbalance in the following ways:
The figure below gives a visual representation of how different resampling techniques rebalance the classes.

While the process of balancing classes provides considerable advantages during training, these techniques, as with most, have a tradeoff. For example, if we were to use a basic technique of replicating minority instances using over-sampling, our model would become prone to overfitting. On the other hand, undersampling the majority class reduces the amount of information that can be used to train our model. Therefore, we must adjust and tune our model accordingly.
from imblearn.under_sampling import RandomUnderSampler
# Undersample the majority class so both classes are equally represented
# in the training set (the unused, misleading `num_total` was removed).
rus = RandomUnderSampler(random_state=0)
X_train_us, y_train_us = rus.fit_resample(X_train, y_train)
num_fraud = (y_train_us == 1).sum()
num_norm = (y_train_us == 0).sum()
print("Ponzi instances: ", num_fraud)
print("Normal instances: ", num_norm)
print("-----")
print("Resampled Ponzi TX %: ", num_fraud/(num_fraud+num_norm)*100)
print("Resampled Normal TX %: ", num_norm/(num_fraud+num_norm)*100)
SMOTE (Synthetic Minority Over-sampling Technique) is a method that synthesizes new members of the minority class by extrapolating features based on prior minority class instances and its k nearest neighbors. In traditional methods of oversampling, instances of the minority class are duplicated which is prone to overfitting, whereas SMOTE rebalances our classes with artificially generated data that has been shown to suffer less from overfitting.
The SMOTE algorithm first obtains the k-nearest neighbors of a sample of the minority class $x_o$, then randomly select a minority class sample amongst k neighbors $\bar{x_i}$. Using these two samples, the algorithm then generates a new instance by interpolating between the two:
$x_{new} = x_o + rand(0,1) \times (x_i - x_o) $
As a result, the new generated instances cause the classifier to expand the deicision region for the minority class.
We could also extend SMOTE to nominal features using the value difference metric (VDM) to measure the distance between two feature values, where:
$V_n$ = feature value
$C_n$ = frequency of $V_n$
$C_{ni}$ = frequency of $V_n$ for class $i$
$\delta(V_1, V_2) = \sum\limits_{i=1}^n \left|\frac{C_{1i}}{C_1} - \frac{C_{2i}}{C_2}\right|^k $
Using VDM, we can then calculate the distance between two feature vectors: $r=1$ yields Manhattan distance, $r=2$ yields Euclidean distance:
$\Delta(X,Y) = w_xw_y \sum\limits_{i=1}^N \delta(x_i,y_i)^r$
Image source
SMOTE publication by Chawla et al.
from imblearn.over_sampling import SMOTE
# Oversample the minority class with synthetic (interpolated) instances
# (the unused, misleading `num_total` was removed).
sm = SMOTE(random_state=2)
X_train_os, y_train_os = sm.fit_resample(X_train, y_train)
num_fraud = (y_train_os == 1).sum()
num_norm = (y_train_os == 0).sum()
print("Ponzi instances: ", num_fraud)
print("Normal instances: ", num_norm)
print("-----")
print("Resampled Ponzi TX %: ", num_fraud/(num_fraud+num_norm)*100)
print("Resampled Normal TX %: ", num_norm/(num_fraud+num_norm)*100)
To get a sense of how effective resampling really is, we can take a basic logistic regression model and compare the performance on our standard, undersampled, and oversampled datasets. We will define a robust function that allows us to test individual models on different sampling techniques, and then visualize performance using the confusion matrix and the precision-recall curve. Generally speaking, the higher the area under the PRC, the better the overall performance of the model.
def modelReport(name, sampling, model,X_train,X_test,y_train,y_test,isTuned, isEnsemble):
    """Fit (unless already tuned) and evaluate a classifier, printing a
    confusion-matrix heatmap, a precision-recall curve and a classification
    report.

    Parameters
    ----------
    name, sampling : str
        Labels for the model and resampling technique (display only).
    model : sklearn-style estimator
        Pre-fit when isTuned is True, otherwise fit here on X_train/y_train.
    isTuned : bool
        Skip fitting when the model has already been tuned/fit.
    isEnsemble : bool
        Score via predict_proba (ensembles) instead of decision_function.

    Returns
    -------
    list
        [name, sampling, recall, average precision] with scores formatted
        to two significant figures.
    """
    if(not isTuned): #If the model is not already tuned, we need to fit our model
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Either a class-1 probability or a decision margin works as the
    # ranking score for the precision-recall curve.
    if(isEnsemble):
        y_score = model.predict_proba(X_test)[:,1] #predict_proba gives us a probability matrix for our classes
    else:
        y_score = model.decision_function(X_test)
    print("\n----------- " + name + " w/ " + sampling + " -----------")
    # Confusion-matrix heatmap (the unused `fig` binding was dropped).
    plt.figure(figsize=(6,3))
    plt.title("Confusion_matrix")
    plt.xlabel("Predicted_class")
    plt.ylabel("Real class")
    sns.heatmap(confusion_matrix(y_test,y_pred),cmap="RdBu_r",annot=True,linewidths=0.5,fmt='g')
    plt.show()
    print('Recall: ',recall_score(y_test,y_pred))
    print('Precision:',precision_score(y_test,y_pred))
    # NOTE(review): this prints average precision (area under the PRC),
    # not ROC AUC, despite the 'AUC' label.
    print('AUC: ',average_precision_score(y_test,y_score))
    # sklearn provides the per-threshold precision/recall pairs directly.
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    # Plot the PRC to view overall model performance.
    plt.step(recall, precision, alpha=0.6, color='b', where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', step='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0, 1.05])
    plt.xlim([0, 1])
    plt.title('Precision-Recall curve')
    plt.show()
    # Per-class summary of precision/recall/F1.
    print(classification_report(y_test,y_pred))
    return([name, sampling, '{0:.2g}'.format(recall_score(y_test,y_pred)),
            '{0:.2g}'.format(average_precision_score(y_test,y_score))])
As we might have expected, using a dataset with no form of rebalancing yields relatively low performance.
To reiterate the importance of metrics, this model has an accuracy of over 90%!! However, we now know that high accuracy in imbalanced datasets is smoke and mirrors. Our recall, on the other hand, is an abysmal 3%. Let's see how resampling techniques affects our recall.
# Baseline: plain logistic regression on the untouched (imbalanced) split.
baseModel = LogisticRegression()
modelReport("LR", "No sampling", baseModel,
            X_train, X_test, y_train, y_test,
            isTuned=False, isEnsemble=False)
Even with a straightforward random undersampling, we see an astronomical improvement in our level of recall. We were able to classify most ponzi schemes. On the other hand, our model incorrectly classified a high amount of instances as ponzi schemes.
# Same model, trained on the randomly undersampled training set instead.
us_model = LogisticRegression()
modelReport("LR", "Undersampling", us_model,
            X_train_us, X_test, y_train_us, y_test,
            isTuned=False, isEnsemble=False)
Oversampling gives us much of the same, a higher level of recall at the expense of misclassifying a large amount of non-ponzi schemes. It's pretty clear that resampling our dataset provides substantial improvement in the training of our model. In the next section we will examine the effectiveness of different models on resampled datasets.
# Same model again, trained on the SMOTE-oversampled training set.
os_model = LogisticRegression()
modelReport("LR", "SMOTE Oversampling", os_model,
            X_train_os, X_test, y_train_os, y_test,
            isTuned=False, isEnsemble=False)
Now that we've demonstrated the effectiveness of resampling, we can work with different models and sampling technique combinations to find the best performance for this classification task.
To streamline things, we'll keep a list of our models and resampling methods:
# [label, estimator, is_ensemble] - the flag tells modelReport whether to
# score via predict_proba (tree ensembles) or decision_function.
models = [
    ['LR', LogisticRegression(random_state=2), False],
    ['SVM', SVC(random_state=2, gamma='scale'), False],
    ['ADA', AdaBoostClassifier(random_state=2), True],
    ['QDA', QuadraticDiscriminantAnalysis(), False],
    ['RF', RandomForestClassifier(random_state=2, n_estimators=100), True],
    ['GB', GradientBoostingClassifier(random_state=2), True],
]
Similar to before, we'll use the imblearn package to transform our dataset using various resampling techniques, each offering their own tradeoffs, which we'll investigate.
def resample(name,method,X_train,y_train):
    """Apply one imblearn resampler and return [name, X_res, y_res] with the
    resampled arrays wrapped back into DataFrames."""
    # fit_resample is the current imblearn API (fit_sample was a deprecated,
    # now-removed alias) and matches the calls used elsewhere in this file.
    X__train_resampled, y__train_resampled = method.fit_resample(X_train,y_train)
    return [name, pd.DataFrame(X__train_resampled), pd.DataFrame(y__train_resampled)]
# Build one training dataset per resampling strategy (plus the raw split).
datasets = [["No sampling", X_train, y_train]]
for label, method in [
    ("RUS", RandomUnderSampler()),
    ("NM", NearMiss(n_jobs=-1)),
    ("CC", ClusterCentroids(n_jobs=-1)),
    ("ROS", RandomOverSampler()),
    ("SMOTE", SMOTE(n_jobs=-1)),
    ("SNN", SMOTEENN()),
    ("STK", SMOTETomek()),
]:
    datasets.append(resample(label, method, X_train, y_train))
We'll keep track of scores for recall and AUC for our PRC curve to get a sense of how well each model/sampling technique performs. Using our modelReport function we defined earlier, we'll get a detailed report for the performance of each model. Having a scroll through the confusion matrices, we can see that some definitely perform better than others.
# Run every model against every resampled dataset, collecting one row of
# recall scores and one row of PRC-AUC scores per model.
recall_scores = []
auc_scores = []
for model_name, estimator, is_ensemble in models:
    recall_scores.append([model_name])
    auc_scores.append([model_name])
    # One full performance report per (model, dataset) combination.
    for ds_name, X_res, y_res in datasets:
        report = modelReport(model_name, ds_name, estimator,
                             X_res, X_test, y_res, y_test,
                             False, is_ensemble)
        recall_scores[-1].append(report[2])
        auc_scores[-1].append(report[3])
To give a better overview of each model's performance, we can view our performance matrix for recall and AUC of our PRC. As you can see, we have a few instances of perfect recall! Which means our model was able to correctly classify every ponzi scheme. We also know that this may be due to overfitting, which we'll explore more in the following section.
Looking at some of the instances of high recall, we also see that the PRC AUC can be quite low, which indicates that the model generated high amount of false positives. In the context of bitcoin, this may not be the worst thing as it would give the FBI, or any kind of regulatory agency, a list of addresses for further inspection that would contain all actual ponzi schemes.
We certainly don't want to make the FBI sift through hundreds of false positives, so we will tune our model's hyperparameters while checking for any overfitting.
print("Recall Scores:")
# One column per resampling technique, one row per model.
column_headers = ["None", "RUS", "NM", "CC", "ROS", "SMOTE", "SNN", "STK"]
rec_scores = np.array(recall_scores)
rec = pd.DataFrame(data=rec_scores[0:,1:],index=rec_scores[:,:1].flatten('F'),columns=column_headers)
rec.head(6)
print("AUC Scores:")
auc_scores = np.array(auc_scores)
# Renamed from `auc` to avoid shadowing sklearn.metrics.auc imported above.
auc_table = pd.DataFrame(data=auc_scores[0:,1:],index=auc_scores[:,:1].flatten('F'),columns=column_headers)
auc_table.head(6)
We've seen that some models can actually perform pretty well, even classifying all of the ponzis schemes correctly. But before we determine which model/resampling technique is best, it's a good idea to use cross validation to investigate any overfitting/underfitting.
Overfitting tends to occur when the difference between the training score and the cross validation score is high due to high variance.
Underfitting happens when both our training and cross-validation scores are both low
We also have to be careful when cross validating with resampled data. If we were to resample before cross validating, we would find artificially high scores, because information from the validation set leaks into the synthetic training samples. Instead we resample each fold during the cross validation process. To better demonstrate this I've included a diagram: on the left is the incorrect process of resampling before cross validating, which leads to overfitting; on the right, we resample while cross validating.


We'll implement cross validation using stratified shuffle split, which allows us to resample each split during the process, at the same time we'll tune our model to achieve the highest level of recall.
def modelTune(model, param_grid, X_train, y_train, scoring = 'recall'):
    """Grid-search `param_grid` with 5-fold CV and return the estimator that
    maximises the given scoring metric."""
    search = GridSearchCV(model, param_grid, scoring=scoring, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_estimator_
#Since we've already transformed our datasets,
#we've regenerated a list of our sampling method objects for cross validating appropriately
# Fresh (name, sampler) pairs so each cross-validation fold can be resampled
# on the fly instead of reusing the pre-transformed datasets.
samplingMethods = [
    ["RUS", RandomUnderSampler()],
    ["NM", NearMiss(n_jobs=-1)],
    ["CC", ClusterCentroids(n_jobs=-1)],
    ["ROS", RandomOverSampler()],
    ["SMOTE", SMOTE(n_jobs=-1)],
    ["SNN", SMOTEENN()],
    ["STK", SMOTETomek()],
]
def cross_validate(model, modelname, sampler, samplername, params, X, y):
    """Stratified shuffle-split CV that resamples INSIDE each fold (so no
    synthetic or duplicated samples leak into the validation half), tunes the
    model on the resampled training fold, and prints mean scores.

    NOTE(review): this shadows sklearn.model_selection.cross_validate
    imported at the top of the file; the sklearn function is unused here.
    X and y are expected to be numpy arrays (callers pass `.values`).
    """
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) #we initialize our SSS object
    #lists to keep track of scoring
    precision_scores = []
    recall_scores = []
    auc_scores = []
    for train_index, test_index in sss.split(X, y):
        #using indices for each split we generate a new test and train set
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #resample ONLY the training fold - the test fold stays untouched
        X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
        #tune on the resampled fold (scored by ROC AUC, not recall - the
        #original comment here was misleading)
        best_model = modelTune(model, params, X_train_res, y_train_res, scoring = 'roc_auc')
        y_pred = best_model.predict(X_test)
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        # NOTE(review): average precision is computed from hard 0/1 predictions
        # rather than a continuous score, which understates the PRC AUC;
        # consider predict_proba / decision_function here.
        auc_scores.append(average_precision_score(y_test,y_pred))
    print('-------' + modelname + " w/ " + samplername + '--------')
    print("Recall: {}".format(np.mean(recall_scores)))
    print("Precision: {}".format(np.mean(precision_scores)))
    print("AUC: {}".format(np.mean(auc_scores)))
    print('\n')
QDA is a bayesian algorithm that operates using a quadratic decision surface as implied by the name. In our previous tests, QDA showcased high levels of performance so its a good candidate to examine for any potential overfitting.
# Tune QDA on each pre-resampled dataset and record its test-set scores.
model = QuadraticDiscriminantAnalysis()
params = {
    'reg_param': [0.0, 0.001, 0.01, 0.05, 0.1, .5],
    'tol': [0.0, 0.0001, 0.001, 0.01, 0.1],
}
qda_recall_scores = []
qda_auc_scores = []
for ds_name, X_res, y_res in datasets:
    best_model = modelTune(model, params, X_res, y_res, scoring='recall')
    report = modelReport("QDA", ds_name, best_model,
                         X_res, X_test, y_res, y_test, True, True)
    qda_recall_scores.append(report[2])
    qda_auc_scores.append(report[3])
print("QDA Recall Scores")
print(column_headers)
print(np.array(qda_recall_scores))
print("QDA AUC Scores")
print(column_headers)
print(np.array(qda_auc_scores))
Comparing our recall and cross validated scores we see that a highly tuned QDA model is certainly prone to overfitting, and may not be our best choice.
# Per-fold-resampled cross-validation of QDA under each sampling method.
model = QuadraticDiscriminantAnalysis()
params = {
    'reg_param': [0.0, 0.001, 0.01, 0.05, 0.1, .5],
    'tol': [0.0, 0.0001, 0.001, 0.01, 0.1],
}
for samplername, sampler in samplingMethods:
    cross_validate(model, "QDA", sampler, samplername, params,
                   X_train.values, y_train.values)
In recent years, random forest has shown to be a strong classifier within the machine learning community. We'll tune our RF model and compare it with our cross validation scores to gauge how well the model generalizes.
# Tune a random forest on each pre-resampled dataset and record test scores.
model = RandomForestClassifier(random_state=2, n_estimators=100)
params = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 4)),
    "min_samples_leaf": list(range(5, 7)),
}
rf_recall_scores = []
rf_auc_scores = []
for ds_name, X_res, y_res in datasets:
    best_model = modelTune(model, params, X_res, y_res, scoring='recall')
    report = modelReport("RF", ds_name, best_model,
                         X_res, X_test, y_res, y_test, True, True)
    rf_recall_scores.append(report[2])
    rf_auc_scores.append(report[3])
print("RF Recall Scores")
print(column_headers)
print(np.array(rf_recall_scores))
print("RF AUC Scores")
print(column_headers)
print(np.array(rf_auc_scores))
Now that we have our tuned model, let's compare it with our cross validation scores. Here we see that our recall and cross validated recall gravitate towards each other, indicating that our model is less prone to overfitting and is likely a good candidate. If we were to choose a model / resampling technique that works best, random forest decision classifier using random undersampling demonstrates a consistently high level of recall while maintaining a considerable level of precision.
# Per-fold-resampled cross-validation of the random forest.
model = RandomForestClassifier(random_state=2, n_estimators=100)
params = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 4)),
    "min_samples_leaf": list(range(5, 7)),
}
for samplername, sampler in samplingMethods:
    cross_validate(model, "RF", sampler, samplername, params,
                   X_train.values, y_train.values)